{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模块导入"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "from keras.utils import np_utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"data/train_clean.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 连续值特征数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def continuous_features(data):\n",
    "\tdef normalize_feature(df):\n",
    "\t\treturn df.apply(lambda column: (column - column.mean()) / column.std())\n",
    "\n",
    "\tcontinuous_train = data[[\"time\", \"resolution_ratio\"]]\n",
    "\tcontinuous_train = normalize_feature(continuous_train)\n",
    "\tcontinuous_train[\"ip\"] = data[\"ip\"]\n",
    "\tcontinuous_train = np.array(continuous_train)\n",
    "\n",
    "\treturn continuous_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "continuous_train = continuous_features(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 3)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "continuous_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# one-hot特征数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_hot_features(data):\n",
    "\tdef make_one_hot(feature, one_hot_data):\n",
    "\t\ttry:\n",
    "\t\t\tn = len(index_json[feature])\n",
    "\t\texcept:\n",
    "\t\t\tn = int(data[feature].max())\n",
    "\n",
    "\t\treturn np_utils.to_categorical(one_hot_data, n + 1)\n",
    "\n",
    "\twith open(\"data/index_json.json\", \"r\") as f:\n",
    "\t\tindex_json = json.load(f)\n",
    "\n",
    "\tone_hot_set = {}\n",
    "\tfor i in [\"apptype\", \"dvctype\", \"ntt\", \"carrier\", \"orientation\", \"lan\"]:\n",
    "\t\tone_hot_set[i] = make_one_hot(i, np.uint8(data[i].values.reshape(len(data[i]), 1)))\n",
    "\n",
    "\tone_hot_train = np.hstack(tuple(one_hot_set.values()))\n",
    "\n",
    "\treturn one_hot_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "one_hot_train = one_hot_features(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 111)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "one_hot_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# embedding特征数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def embedding_features(data):\n",
    "\tembedding_train = data[[\"pkgname\", \"adunitshowid\", \"mediashowid\", \"city\", \\\n",
    "\t\t\t\t\t\"adidmd5\", \"imeimd5\",\"openudidmd5\", \"macmd5\", \\\n",
    "\t\t\t\t\t\"model\", \"osv\"]]\n",
    "\tembedding_train = np.array(embedding_train)\n",
    "\n",
    "\treturn embedding_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding_train = embedding_features(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 10)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embedding_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 整合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y_train = np.array(data[\"label\"]).reshape(len(data[\"label\"]), 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 1)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "unembedding_train = np.hstack((continuous_train, one_hot_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 114)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unembedding_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
